# Exploratory analysis of an NBFC loan-transaction dataset (notebook-style script).
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): '%matplotlib inline' is IPython magic — valid only in a
# Jupyter/IPython session, not in a plain .py script.
%matplotlib inline
import warnings
# Silences ALL warnings for cleaner notebook output; this also hides deprecation notices.
warnings.filterwarnings("ignore")
#Importing the important libraries
# NOTE(review): machine-specific absolute file:// URL — this will only load on the
# original author's machine; consider a relative path instead.
df=pd.read_excel('file:///C:/Users/Ritesh/Desktop/Notes/Datasets%20for%20Python/NBFC Loan Transaction Data.xlsx')
#Loading of the data
df.head()
#First 5 records
# Date span of the loan book: earliest/latest authorization and last-receipt dates.
print(df['AUTHORIZATIONDATE'].min())
print(df['AUTHORIZATIONDATE'].max())
print(df['LAST_RECEIPT_DATE'].min())
print(df['LAST_RECEIPT_DATE'].max())
#We can see that there are total 53 columns. And looking at the various columns we can see that there are null values present
#in certain columns. Moreover, looking at the datatypes of the columns, the datatypes look good.
df.info()
df.shape
pd.options.display.max_columns = None
#To display all the columns.
df.describe().T
# Rows where the sanctioned LOAN_AMT exceeds NET_DISBURSED_AMT — presumably
# upfront deductions at disbursal; TODO confirm with the data dictionary.
df[df['LOAN_AMT'] - df['NET_DISBURSED_AMT'] > 0].head(10)
# Derived-column relationships observed in the data (notes, not executable code):
#   PRE_EMI_DUEAMT = PRE_EMI_RECEIVED_AMT + PRE_EMI_OS_AMOUNT
#   OUTSTANDING_PRINCIPAL + PAID_PRINCIPAL = LOAN_AMT
#   BALANCE_TENURE + COMPLETED_TENURE = CURRENT_TENOR
#   ORIGNAL_TENOR - CURRENT_TENOR = DIFF_ORIGINAL_CURRENT_TENOR
#   EMI_OS_AMOUNT - BALANCE_EXCESS = NET_RECEIVABLE
#   PRE_EMI_DUEAMT - PRE_EMI_OS_AMOUNT = PRE_EMI_RECEIVED_AMT
#   INTEREST_START_DATE - AUTHORIZATIONDATE = DIFF_AUTH_INT_DATE
#   CURRENT_INTEREST_RATE_MAX - CURRENT_INTEREST_RATE_MIN = DIFF_CURRENT_INTEREST_RATE_MAX_MIN
#   MAX_EMI_AMOUNT - MIN_EMI_AMOUNT = DIFF_EMI_AMOUNT_MAX_MIN
#   ORIGNAL_INTEREST_RATE - CURRENT_INTEREST_RATE = DIFF_ORIGINAL_CURRENT_INTEREST_RATE
# Related tenure columns: BALANCE_TENURE, COMPLETED_TENURE, CURRENT_TENOR,
#   DIFF_ORIGINAL_CURRENT_TENOR, ORIGNAL_TENOR
# Spot-check the related column groups listed in the notes above.
df[['BALANCE_TENURE','COMPLETED_TENURE','CURRENT_TENOR','DIFF_ORIGINAL_CURRENT_TENOR','ORIGNAL_TENOR']].head()
df[['EMI_OS_AMOUNT','BALANCE_EXCESS','NET_RECEIVABLE']].head()
df[['EMI_DUEAMT','EMI_OS_AMOUNT','EMI_RECEIVED_AMT']].head()
df[['DIFF_AUTH_INT_DATE','AUTHORIZATIONDATE','INTEREST_START_DATE']].head()
df[['MAX_EMI_AMOUNT','MIN_EMI_AMOUNT','DIFF_EMI_AMOUNT_MAX_MIN']].head()
df[['CURRENT_INTEREST_RATE_MAX','CURRENT_INTEREST_RATE_MIN','DIFF_CURRENT_INTEREST_RATE_MAX_MIN']].head()
df[['ORIGNAL_INTEREST_RATE','CURRENT_INTEREST_RATE','DIFF_ORIGINAL_CURRENT_INTEREST_RATE']].head()
df[['BALANCE_EXCESS','EXCESS_AVAILABLE','EXCESS_ADJUSTED_AMT']].head(10)
(df['COMPLETED_TENURE'] + df['DIFF_ORIGINAL_CURRENT_TENOR']).head()
# Accounts where pre-EMI dues exceed what was actually received.
df[df['PRE_EMI_DUEAMT'] - df['PRE_EMI_RECEIVED_AMT'] > 0][['PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT','PRE_EMI_RECEIVED_AMT']].head()
df.shape
df.isnull().sum()
#Null values are present. NPA_IN_LAST_MONTH & NPA_IN_CURRENT_MONTH are two columns having excess null values, and it would
#be better to drop them off.
df.duplicated().sum()
#There are no duplicate records.
df.drop(['NPA_IN_LAST_MONTH','NPA_IN_CURRENT_MONTH'],axis=1,inplace=True)
#Dropping these columns as they have a lot of null values.
# NOTE: this drops every row containing a null in ANY remaining column.
df.dropna(inplace=True)
#Dropping the rows having null values in it
df.isnull().sum()
#No null values remain.
df['AGREEMENTID'].nunique()
#We can drop this unique identifier column as it will not be used for our analysis.
df.shape
df.drop(['AGREEMENTID'],axis=1,inplace=True)
# Dropping the unique identifier column.
df.head()
# Class balance of the target variable FORECLOSURE (counts and proportions).
df['FORECLOSURE'].value_counts()
foreclosure_counts = df['FORECLOSURE'].value_counts()
sns.barplot(x=foreclosure_counts.index, y=foreclosure_counts.values)
df['FORECLOSURE'].value_counts(normalize=True)
# Fix: seaborn 0.12+ removed positional x/y support — pass them as keywords.
# Also compute the groupby mean once instead of twice.
mean_balance_excess = df.groupby(df['FORECLOSURE'])['BALANCE_EXCESS'].mean()
sns.barplot(x=mean_balance_excess.index, y=mean_balance_excess.values)
plt.ylabel('Mean Balance Excess')
#Balance excess is higher in case of foreclosure. As the collaterals are auctioned to pay off loans,
# there is a higher probability of recovering more than the outstanding amount.
# New feature: percentage of the current tenor already completed.
df['Percentage_Completion'] = (df['COMPLETED_TENURE']/df['CURRENT_TENOR']) *100
df.head()
df.describe()
# Fix: seaborn 0.12+ removed positional x/y support — pass them as keywords,
# and compute each groupby mean once instead of twice.
mean_pct_completion = df.groupby(df['FORECLOSURE'])['Percentage_Completion'].mean()
sns.barplot(x=mean_pct_completion.index, y=mean_pct_completion.values)
plt.ylabel('Mean Percentage Completion')
# Interesting insight: accounts that foreclose have a higher completion percentage,
# suggesting borrowers are not defaulting wilfully from the start.
mean_balance_tenure = df.groupby(df['FORECLOSURE'])['BALANCE_TENURE'].mean()
sns.barplot(x=mean_balance_tenure.index, y=mean_balance_tenure.values)
plt.ylabel('Mean Balance Tenure')
# Mean balance tenure differs between the two foreclosure classes.
df.groupby(df['FORECLOSURE'])['CURRENT_TENOR'].mean()
# Customer counts per (FORECLOSURE, CITY) pair, sorted descending.
City_counts=pd.DataFrame(df.groupby(['FORECLOSURE','CITY'])['CUSTOMERID'].count().sort_values(ascending=False))
City_counts=City_counts.reset_index()
City_counts[City_counts['FORECLOSURE']==0].head()
City_counts[City_counts['FORECLOSURE']==1].head()
mean_completed_tenure = df.groupby(['FORECLOSURE'])['COMPLETED_TENURE'].mean()
sns.barplot(x=mean_completed_tenure.index, y=mean_completed_tenure.values)
plt.xlabel("Completed Tenure")
# Fix: seaborn 0.12+ removed positional x/y support; the np.array wrapper was
# also unnecessary — a Series/array works directly as a keyword argument.
top_foreclosure_cities = City_counts[City_counts['FORECLOSURE']==1].head()
sns.barplot(x=top_foreclosure_cities['CITY'], y=top_foreclosure_cities['CUSTOMERID'].values)
plt.xlabel('City with most number of FORECLOSURES')
# MUMBAI records the highest number of foreclosures, followed by Hyderabad — a closer
# look has to be given to these cities.
top_regular_cities = City_counts[City_counts['FORECLOSURE']==0].head()
sns.barplot(x=top_regular_cities['CITY'], y=top_regular_cities['CUSTOMERID'].values)
plt.xlabel('City with most number of non-FORECLOSURES')
# MUMBAI also records the highest number of non-foreclosure cases, followed by Hyderabad.
city_loan_counts = df['CITY'].value_counts()
sns.barplot(x=city_loan_counts.index[:5], y=city_loan_counts.values[:5])
plt.xlabel('Cities with maximum number of loan given')
# MUMBAI and HYDERABAD received the highest number of loans.
# Loan counts per PRODUCT within each FORECLOSURE class (CITY is just a convenient column to count).
PRODUCT_STATUS=pd.DataFrame(df.groupby(['FORECLOSURE','PRODUCT'])['CITY'].count()).reset_index()
sns.barplot(x=PRODUCT_STATUS[PRODUCT_STATUS['FORECLOSURE'] == 0]['PRODUCT'],y=PRODUCT_STATUS[PRODUCT_STATUS['FORECLOSURE'] == 0]['CITY'])
plt.xlabel('Products count for non-foreclosure account')
plt.ylabel(' Counts ')
sns.barplot(x=PRODUCT_STATUS[PRODUCT_STATUS['FORECLOSURE'] == 1]['PRODUCT'],y=PRODUCT_STATUS[PRODUCT_STATUS['FORECLOSURE'] == 1]['CITY'])
plt.xlabel('Products for foreclosure account')
plt.ylabel(' Counts ')
# Most foreclosures occur for PRODUCT HL followed by STHL; customers in these
# categories warrant a more thorough inspection before loans are granted.
# Fix: seaborn 0.12+ removed positional x/y support — pass them as keywords.
product_counts = df['PRODUCT'].value_counts()
sns.barplot(x=product_counts.index, y=product_counts.values)
plt.xlabel('Product Codes')
plt.ylabel(' Product Counts ')
#Maximum number of loans has been disbursed as Short Term Home Loan.
# Top 5 most frequent MOB codes.
mob_counts = df['MOB'].value_counts(ascending=False).head()
mob_counts
# Fix: seaborn 0.12+ removed positional x/y support — pass them as keywords.
sns.barplot(x=mob_counts.index, y=mob_counts.values)
plt.xlabel('Most popular code')
# The most popular code is 9, followed by 7 and 12.
plt.figure(figsize=(10,5))
sns.barplot(x=df['PRODUCT'], y=df['LOAN_AMT'], hue=df['FORECLOSURE'])
# The mean loan amount for product LAP is highest. Moreover, foreclosed accounts
# carry the highest mean loan amount compared to others.
plt.figure(figsize=(8,5))
sns.barplot(x=df['PRODUCT'], y=df['LOAN_AMT'])
sns.barplot(x=df['FORECLOSURE'], y=df['NET_LTV'])
# Net loan-to-value ratio looks similar for both classes.
# Bucket NET_LTV into 10-point bands to study foreclosure by LTV bracket.
df['NET_LTV_RANGE']=pd.cut(df['NET_LTV'], [0,10,20,30,40,50,60,70,80,90,100], include_lowest=True)
#Introducing a new column NET_LTV_RANGE to understand under which bracket the NET_LTV is falling. This may help us to
#get some new insights.
df[['NET_LTV','NET_LTV_RANGE']].head()
plt.figure(figsize=(10,5))
sns.countplot(y=df['NET_LTV_RANGE'])
# We can see that the most loans have been given to those having NET_LTV_RANGE between 40 and 70.
plt.figure(figsize=(10,5))
sns.barplot(y=df[df['FORECLOSURE'] == 1]['NET_LTV_RANGE'].value_counts().index , x=df[df['FORECLOSURE'] == 1]['NET_LTV_RANGE'].value_counts().values)
plt.xlabel('Counts corresponding to FORECLOSURE')
# We can see that most foreclosures happened when NET_LTV_RANGE is between 70 and 80.
# The bank should try to lend on the safer side in the range of 30 to 50, as most foreclosures take place around
# 50-80.
plt.figure(figsize=(10,5))
sns.countplot(y=df['NET_LTV_RANGE'],hue=df['FORECLOSURE'])
plt.xlabel(' Respective counts corresponding to FORECLOSURE & NON-FORECLOSURE')
# Most popular scheme IDs overall and split by foreclosure status.
df['SCHEMEID'].value_counts().head()
plt.figure(figsize=(10,5))
sns.barplot(x=df['SCHEMEID'].value_counts().head(5).index , y=df['SCHEMEID'].value_counts().head().values)
plt.xlabel('Most popular scheme')
plt.figure(figsize=(10,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1]['SCHEMEID'].value_counts(ascending=False).head(5).index , y=df[df['FORECLOSURE'] == 1]['SCHEMEID'].value_counts(ascending=False).head(5).values)
plt.xlabel(' Top 5 Schemes resulting in maximum foreclosure')
plt.figure(figsize=(10,5))
sns.barplot(x=df[df['FORECLOSURE'] == 0]['SCHEMEID'].value_counts(ascending=False).head(5).index , y=df[df['FORECLOSURE'] == 0]['SCHEMEID'].value_counts(ascending=False).head(5).values)
plt.xlabel('Top 5 Schemes having maximum number of non-foreclosure case')
df['SCHEMEID'].value_counts(ascending=True).head(48)
#Least popular scheme ids.
plt.figure(figsize=(8,5))
# Fix: seaborn 0.12+ removed positional x/y support — pass them as keywords.
mob_top5 = df['MOB'].value_counts(ascending=False).head()
sns.barplot(x=mob_top5.index, y=mob_top5.values)
plt.xlabel('Most popular Internal code')
plt.figure(figsize=(8,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1]['MOB'].value_counts(ascending=False).head(5).index , y=df[df['FORECLOSURE'] == 1]['MOB'].value_counts(ascending=False).head(5).values)
plt.xlabel('Top 5 Internal code having maximum number of foreclosure case')
plt.figure(figsize=(8,5))
sns.barplot(x=df[df['FORECLOSURE'] == 0]['MOB'].value_counts(ascending=False).head(5).index , y=df[df['FORECLOSURE'] == 0]['MOB'].value_counts(ascending=False).head(5).values)
plt.xlabel('Top 5 Internal code having maximum number of non-foreclosure case')
# Bucket FOIR (presumably a fixed-obligation-to-income ratio — TODO confirm) into
# bands; the wide upper bins (>1) exist because some records carry implausibly large values.
df['FOIR_Range']=pd.cut(df['FOIR'], [0,0.20,0.40,0.60,0.80,1,5,10,15,20,25,30,35], include_lowest=True)
df.isnull().sum()
#We can clearly see that these records for the FOIR column are incorrectly inserted, as FOIR cannot be negative and cannot have
#such high values. Hence we are going to drop these 2 rows.
df[df['FOIR_Range'].isnull()][['FOIR','FOIR_Range']]
# NOTE: drops every row with a null in ANY column (at this point only FOIR_Range has nulls).
df.dropna(inplace=True)
df[df['FOIR_Range'].isnull()][['FOIR','FOIR_Range']]
df['FOIR_Range'].value_counts()
plt.figure(figsize=(8,5))
sns.countplot(y=df['FOIR_Range'])
plt.xlabel('Total counts for a specific range ')
plt.figure(figsize=(8,5))
sns.barplot(y=df[df['FORECLOSURE'] == 1]['FOIR_Range'].value_counts(ascending=False).head(5).index , x=df[df['FORECLOSURE'] == 1]['FOIR_Range'].value_counts(ascending=False).head(5).values)
plt.xlabel('FORECLOSURE counts for a specific range ')
# Ratio of foreclosure counts to total counts within each FOIR band.
FOIR_RATIO = (df[df['FORECLOSURE'] == 1]['FOIR_Range'].value_counts())/(df['FOIR_Range'].value_counts())
FOIR_RATIO
#Creating a new feature comprising of the ratio between FORECLOSURE counts in a specific range to the Total counts
# in that range
plt.figure(figsize=(8,5))
sns.barplot(y=FOIR_RATIO.index,x=FOIR_RATIO.values)
plt.xlabel('Ratio between FORECLOSURE counts in a specific range to the Total counts in that range ')
#We can observe that whenever the FOIR ratio exceeds 80%, the count of foreclosure increases drastically.
#Moreover, cases where the FOIR ratio is less than 20 percent also have a greater count of foreclosure. This may be a
# case of faking one's income or defaulting wilfully, hence a closer look needs to be given to this.
df.head()
# Effect of the number of interest-rate changes on each foreclosure class.
sns.countplot(x=df[df['FORECLOSURE'] == 0]['CURRENT_INTEREST_RATE_CHANGES'])
plt.xlabel('Effect of CURRENT_INTEREST_RATE_CHANGES on Non-Foreclosure')
sns.countplot(x=df[df['FORECLOSURE'] == 1]['CURRENT_INTEREST_RATE_CHANGES'])
plt.xlabel('Effect of CURRENT_INTEREST_RATE_CHANGES on Foreclosure')
#Current interest rate changes do not appear to have any effect on foreclosure.
# New feature: mean of the max/current/min interest-rate columns.
df['Mean_Interest_Rate'] =(df['CURRENT_INTEREST_RATE_MAX'] + df['CURRENT_INTEREST_RATE'] + df['CURRENT_INTEREST_RATE_MIN'])/3
sns.barplot(y=df['Mean_Interest_Rate'],x=df['FORECLOSURE'])
# People getting foreclosed have a higher mean interest rate. Although the difference is not that great,
# it can still have a positive impact on increasing cases of foreclosure.
df.groupby(['FORECLOSURE'])['Mean_Interest_Rate'].mean()
sns.barplot(y=df['LOAN_AMT'],x=df['FORECLOSURE'])
# People getting foreclosed have a higher loan amount sanctioned. The greater the loan amount,
# the higher the chance of foreclosure.
sns.barplot(y=df['ORIGNAL_TENOR'],x=df['FORECLOSURE'])
# People having a higher original tenor show a higher chance of foreclosure.
sns.barplot(y=df['COMPLETED_TENURE'],x=df['FORECLOSURE'])
#Persons getting foreclosed have a larger completed tenure.
sns.barplot(y=df['DPD'],x=df['FORECLOSURE'])
plt.ylabel('Days past due')
#Days past due is an important factor for FORECLOSURE. People getting foreclosed have higher units of DPD.
sns.barplot(y=df['DIFF_ORIGINAL_CURRENT_INTEREST_RATE'],x=df['FORECLOSURE'])
# Difference between Original and Current interest rate is insignificant in case of Foreclosure.
df.groupby(['FORECLOSURE'])['DIFF_ORIGINAL_CURRENT_INTEREST_RATE'].mean()
# New feature: mean of the current/max/min EMI amount columns.
df['MEAN_EMI_AMOUNT'] =(df['EMI_AMOUNT'] + df['MAX_EMI_AMOUNT'] + df['MIN_EMI_AMOUNT'])/3
sns.barplot(y=df['MEAN_EMI_AMOUNT'],x=df['FORECLOSURE'])
# Persons getting foreclosed are paying a higher mean EMI amount.
sns.barplot(y=df['DIFF_CURRENT_INTEREST_RATE_MAX_MIN'],x=df['FORECLOSURE'])
plt.figure(figsize=(12,8))
sns.scatterplot(x='ORIGNAL_TENOR',y='MEAN_EMI_AMOUNT',data=df,hue='FORECLOSURE')
#We can see that most foreclosure cases occur when the Original Tenor exceeds a 180-month time
# frame. Also, in most cases MEAN_EMI_AMOUNT lies below 15000000.
# Fix: df[mask1][mask2] applies a boolean mask built from the FULL frame to an
# already-filtered frame, which relies on implicit index reindexing (emits a
# UserWarning and is fragile in modern pandas). Combine the conditions with &.
print(df[(df['ORIGNAL_TENOR'] < 180) & (df['FORECLOSURE'] == 1)].shape)
print(df[(df['ORIGNAL_TENOR'] < 190) & (df['FORECLOSURE'] == 1)].shape)
df.head()
# Yearly and monthly totals/counts of disbursed amounts.
plt.figure(figsize=(10,5))
sns.barplot(x=df.groupby(df['AUTHORIZATIONDATE'].dt.year)['NET_DISBURSED_AMT'].sum().index,y=df.groupby(df['AUTHORIZATIONDATE'].dt.year)['NET_DISBURSED_AMT'].sum().values)
plt.xlabel('Sum of Net Disbursed Amount each year')
#There has been a tremendous increase in disbursed loan amount in 2017 and 2018. A closer look must be given to find out
# the reason behind it. Such a drastic increase in loan disbursement may be due to an increase in giving substandard loans.
plt.figure(figsize=(10,5))
sns.barplot(x=df.groupby(df['AUTHORIZATIONDATE'].dt.year)['NET_DISBURSED_AMT'].count().index,y=df.groupby(df['AUTHORIZATIONDATE'].dt.year)['NET_DISBURSED_AMT'].count().values)
plt.xlabel(' Count each year')
# The customer count after 2016 has increased significantly, which may be the reason for the increase in value of loans disbursed.
plt.figure(figsize=(10,5))
sns.barplot(x=df.groupby(df['AUTHORIZATIONDATE'].dt.month)['NET_DISBURSED_AMT'].sum().index,y=df.groupby(df['AUTHORIZATIONDATE'].dt.month)['NET_DISBURSED_AMT'].sum().values)
plt.xlabel('Sum of Net Disbursed Amount each year')
# NOTE(review): the label above says "each year" but the grouping is by month.
# In the month of March the maximum amount of disbursement has taken place.
plt.figure(figsize=(10,5))
sns.barplot(x=df.groupby(df['AUTHORIZATIONDATE'].dt.month)['NET_DISBURSED_AMT'].count().index,y=df.groupby(df['AUTHORIZATIONDATE'].dt.month)['NET_DISBURSED_AMT'].count().values)
plt.xlabel('Sum of Net Disbursed Amount each year')
# NOTE(review): the label above is mismatched — this is a customer count per month.
# Customer counts follow a similar pattern to the sum of the loans disbursed each month.
plt.figure(figsize=(10,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1].groupby(df['LAST_RECEIPT_DATE'].dt.year)['FORECLOSURE'].count().index,y=df[df['FORECLOSURE'] == 1].groupby(df['LAST_RECEIPT_DATE'].dt.year)['FORECLOSURE'].count().values)
plt.xlabel(' Year performance in case of Forelosure')
# Year 2017 has seen the most foreclosures. If we assume this dataset belongs to companies from India,
# demonetisation may have played a huge role in the increase in foreclosures.
plt.figure(figsize=(10,5))
sns.barplot(x=df[df['FORECLOSURE'] == 1].groupby(df['LAST_RECEIPT_DATE'].dt.month)['FORECLOSURE'].count().index,y=df[df['FORECLOSURE'] == 1].groupby(df['LAST_RECEIPT_DATE'].dt.month)['FORECLOSURE'].count().values)
plt.xlabel(' Year performance in case of Forelosure')
# No visible pattern is seen. Nothing can be inferred in this case.
# Latest-transaction-month distribution per foreclosure class; compute each
# groupby count once instead of twice.
plt.figure(figsize=(8,5))
foreclosed_by_month = df[df['FORECLOSURE'] == 1].groupby(df['LATEST_TRANSACTION_MONTH'])['FORECLOSURE'].count()
sns.barplot(x=foreclosed_by_month.index, y=foreclosed_by_month.values)
plt.xlabel(' LATEST TRANSACTION MONTH in case of Forelosure')
plt.figure(figsize=(8,5))
regular_by_month = df[df['FORECLOSURE'] == 0].groupby(df['LATEST_TRANSACTION_MONTH'])['FORECLOSURE'].count()
sns.barplot(x=regular_by_month.index, y=regular_by_month.values)
plt.xlabel(' LATEST TRANSACTION MONTH in case of Non - Forelosure')
plt.figure(figsize=(8,5))
# Fix: seaborn 0.12+ removed positional argument support for countplot — pass x= explicitly.
sns.countplot(x=df['LATEST_TRANSACTION_MONTH'], hue=df['FORECLOSURE'])
plt.xlabel(' LATEST TRANSACTION MONTH in case of Forelosure')
df.head()
# One-hot encode the PRODUCT column, then drop the original.
df_1=pd.concat([df,pd.get_dummies(df['PRODUCT'])],axis=1)
#One hot encoding for Product Column
df_1.head(5)
df_1.shape
df_1.drop(['PRODUCT'],axis=1,inplace=True)
# We can drop the PRODUCT column as it has already been taken care of in the form of one-hot encoding.
# Pairwise correlations of all numeric/engineered columns, first as a table and
# then as an annotated heatmap, to locate multicollinearity.
df[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE', 'CURRENT_INTEREST_RATE', \
'CURRENT_INTEREST_RATE_MAX', 'CURRENT_INTEREST_RATE_MIN','CURRENT_INTEREST_RATE_CHANGES', 'CURRENT_TENOR',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN', 'DIFF_EMI_AMOUNT_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT','EMI_RECEIVED_AMT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS','ORIGNAL_INTEREST_RATE', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT', 'PRE_EMI_RECEIVED_AMT','Percentage_Completion',\
'Mean_Interest_Rate','MEAN_EMI_AMOUNT' ]].corr()
plt.figure(figsize=(20,20))
sns.heatmap(data=df[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE', 'CURRENT_INTEREST_RATE', \
'CURRENT_INTEREST_RATE_MAX', 'CURRENT_INTEREST_RATE_MIN','CURRENT_INTEREST_RATE_CHANGES', 'CURRENT_TENOR',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN', 'DIFF_EMI_AMOUNT_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT','EMI_RECEIVED_AMT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS','ORIGNAL_INTEREST_RATE', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT', 'PRE_EMI_RECEIVED_AMT','Percentage_Completion',\
'Mean_Interest_Rate','MEAN_EMI_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
# Multicollinearity is present in a lot of variables. Hence we will have to intelligently drop some of the highly correlated
# variables.
df_1.head()
# Recap of the derived-column relationships (notes, not executable code):
#   PRE_EMI_DUEAMT = PRE_EMI_RECEIVED_AMT + PRE_EMI_OS_AMOUNT
#   OUTSTANDING_PRINCIPAL + PAID_PRINCIPAL = LOAN_AMT
#   BALANCE_TENURE + COMPLETED_TENURE = CURRENT_TENOR
#   ORIGNAL_TENOR - CURRENT_TENOR = DIFF_ORIGINAL_CURRENT_TENOR
#   EMI_OS_AMOUNT - BALANCE_EXCESS = NET_RECEIVABLE
#   PRE_EMI_DUEAMT - PRE_EMI_OS_AMOUNT = PRE_EMI_RECEIVED_AMT
#   INTEREST_START_DATE - AUTHORIZATIONDATE = DIFF_AUTH_INT_DATE
#   CURRENT_INTEREST_RATE_MAX - CURRENT_INTEREST_RATE_MIN = DIFF_CURRENT_INTEREST_RATE_MAX_MIN
#   MAX_EMI_AMOUNT - MIN_EMI_AMOUNT = DIFF_EMI_AMOUNT_MAX_MIN
#   ORIGNAL_INTEREST_RATE - CURRENT_INTEREST_RATE = DIFF_ORIGINAL_CURRENT_INTEREST_RATE
#   Tenure columns: BALANCE_TENURE, COMPLETED_TENURE, CURRENT_TENOR, DIFF_ORIGINAL_CURRENT_TENOR, ORIGNAL_TENOR
#   Mean_Interest_Rate = (CURRENT_INTEREST_RATE_MAX + CURRENT_INTEREST_RATE + CURRENT_INTEREST_RATE_MIN) / 3
# As Mean_Interest_Rate column is derived from CURRENT_INTEREST_RATE_MAX,CURRENT_INTEREST_RATE,CURRENT_INTEREST_RATE_MIN
# column , we can drop these 3 columns with respect to feature engineering.
# Mean_Interest_Rate was derived from these three rate columns, so they can be dropped.
df_1.drop(['CURRENT_INTEREST_RATE','CURRENT_INTEREST_RATE_MAX','CURRENT_INTEREST_RATE_MIN'],axis=1,inplace=True)
df_1.head()
df[['BALANCE_TENURE','COMPLETED_TENURE','CURRENT_TENOR','ORIGNAL_TENOR']].head()
# BALANCE_TENURE, COMPLETED_TENURE and CURRENT_TENOR are related to each other. Moreover, Balance Tenure
# and Current Tenor are highly correlated with each other. Hence we drop one column — here, Current Tenor.
df_1.drop(['CURRENT_TENOR'],axis=1,inplace=True)
# Re-check the correlation structure after the first round of drops.
plt.figure(figsize=(20,20))
sns.heatmap(data=df_1[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN', 'DIFF_EMI_AMOUNT_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT','EMI_RECEIVED_AMT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS','ORIGNAL_INTEREST_RATE', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT', 'PRE_EMI_RECEIVED_AMT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
df_1.drop(['ORIGNAL_INTEREST_RATE'],axis=1,inplace=True)
#As ORIGNAL_INTEREST_RATE is nicely captured by Mean_Interest_Rate, we drop ORIGNAL_INTEREST_RATE.
df_1.drop(['DIFF_EMI_AMOUNT_MAX_MIN'],axis=1,inplace=True)
# DIFF_EMI_AMOUNT_MAX_MIN is highly correlated with MAX_EMI_AMOUNT. Hence we drop the DIFF_EMI_AMOUNT_MAX_MIN column.
plt.figure(figsize=(20,20))
sns.heatmap(data=df_1[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT','EMI_RECEIVED_AMT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
# Count of rows where pre-EMI dues exceed receipts (should be small if dues ≈ receipts).
df_1[(df_1['PRE_EMI_DUEAMT'] - df_1['PRE_EMI_RECEIVED_AMT'])>0][['PRE_EMI_DUEAMT','PRE_EMI_RECEIVED_AMT']].shape
# NOTE(review): the rationale below discusses the PRE_EMI columns, but this line
# drops EMI_RECEIVED_AMT — confirm this is the intended column.
df_1.drop(['EMI_RECEIVED_AMT'],axis=1,inplace=True)
# PRE_EMI_DUEAMT is roughly equal to PRE_EMI_RECEIVED_AMT, so the two are highly correlated. Moreover, from a
# business point of view, PRE_EMI_RECEIVED_AMT should match PRE_EMI_DUEAMT; if not, the borrower may start defaulting.
# Hence we drop the EMI_RECEIVED_AMT column.
df_1.drop(['PRE_EMI_RECEIVED_AMT'],axis=1,inplace=True)
# Similarly PRE_EMI_RECEIVED_AMT is highly correlated with PRE_EMI_DUEAMT; we drop the PRE_EMI_RECEIVED_AMT column here.
plt.figure(figsize=(20,20))
sns.heatmap(data=df_1[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'LOAN_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_DISBURSED_AMT', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PAID_INTEREST', 'PAID_PRINCIPAL', 'PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
df_1.head()
# Fix: seaborn 0.12+ removed positional argument support — pass x= explicitly.
sns.countplot(x=df_1['LAP'])
sns.countplot(x=df_1['STHL'])
# Combine the two paid components into a single PAID_AMOUNT feature so the
# source columns can be dropped later.
df_1['PAID_AMOUNT'] =df_1['PAID_PRINCIPAL'] + df_1['PAID_INTEREST']
# Feature-engineering bookkeeping (notes, not executable code); '##' marks the
# parts already dropped from df_1:
#   PRE_EMI_DUEAMT = PRE_EMI_RECEIVED_AMT  ## + PRE_EMI_OS_AMOUNT
#   OUTSTANDING_PRINCIPAL + PAID_PRINCIPAL  ## = LOAN_AMT
#   BALANCE_TENURE + COMPLETED_TENURE =  ## CURRENT_TENOR
#   ORIGNAL_TENOR - CURRENT_TENOR = DIFF_ORIGINAL_CURRENT_TENOR
#   EMI_OS_AMOUNT - BALANCE_EXCESS = NET_RECEIVABLE
#   PRE_EMI_DUEAMT - PRE_EMI_OS_AMOUNT = PRE_EMI_RECEIVED_AMT
#   INTEREST_START_DATE - AUTHORIZATIONDATE = DIFF_AUTH_INT_DATE
#   CURRENT_INTEREST_RATE_MAX - CURRENT_INTEREST_RATE_MIN = DIFF_CURRENT_INTEREST_RATE_MAX_MIN
#   MAX_EMI_AMOUNT - MIN_EMI_AMOUNT =  ## DIFF_EMI_AMOUNT_MAX_MIN
#   ORIGNAL_INTEREST_RATE  ## - CURRENT_INTEREST_RATE = DIFF_ORIGINAL_CURRENT_INTEREST_RATE
#   Tenure columns: BALANCE_TENURE, COMPLETED_TENURE, CURRENT_TENOR, DIFF_ORIGINAL_CURRENT_TENOR, ORIGNAL_TENOR
#   Mean_Interest_Rate =  ## (CURRENT_INTEREST_RATE_MAX + CURRENT_INTEREST_RATE + CURRENT_INTEREST_RATE_MIN) / 3
#   Dropped: 'CURRENT_INTEREST_RATE', 'CURRENT_INTEREST_RATE_MAX', 'CURRENT_INTEREST_RATE_MIN'
# PAID_PRINCIPAL and PAID_INTEREST are now captured by PAID_AMOUNT, so drop them.
df_1.drop(['PAID_PRINCIPAL','PAID_INTEREST'],axis=1,inplace=True)
# How many rows have LOAN_AMT strictly greater than NET_DISBURSED_AMT.
df[df['LOAN_AMT'] -df['NET_DISBURSED_AMT'] > 0][['LOAN_AMT','NET_DISBURSED_AMT']].shape
df_1.drop(['LOAN_AMT'],axis=1,inplace=True) #Net disbursed amount is somewhat same as that of loan_amt . As we know the
# all the interest calculation is done on the NET_DISBURSED_AMT and not on LOAN_AMOUNT . Hence for calculation and model
# development purpose , we will keep NET_DISBURSED_AMT and drop LOAN_AMT
plt.figure(figsize=(20,20))
sns.heatmap(data=df_1[['BALANCE_EXCESS', 'BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', \
'LATEST_TRANSACTION_MONTH', 'NET_DISBURSED_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'MONTHOPENING', 'NET_LTV',
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR',\
'OUTSTANDING_PRINCIPAL','PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT','PAID_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
# Multicollinearity is still present for certain variables.
df_2=df_1.copy()
# Taking a copy of df_1 so further drops don't mutate df_1.
df_2.drop(['MONTHOPENING'],axis=1,inplace=True)
# MONTHOPENING is not useful for our analysis. Hence we will be dropping it.
plt.figure(figsize=(20,20))
sns.heatmap(data=df_2[['BALANCE_EXCESS','BALANCE_TENURE','COMPLETED_TENURE','CURRENT_INTEREST_RATE_CHANGES','Mean_Interest_Rate',\
'DIFF_AUTH_INT_DATE','DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE',\
'DIFF_ORIGINAL_CURRENT_TENOR','DPD', 'DUEDAY', 'EMI_AMOUNT', 'EMI_DUEAMT', 'EMI_OS_AMOUNT',\
'EXCESS_ADJUSTED_AMT', 'EXCESS_AVAILABLE', 'FOIR','LAST_RECEIPT_AMOUNT', 'OUTSTANDING_PRINCIPAL',\
'LATEST_TRANSACTION_MONTH', 'NET_DISBURSED_AMT', 'MAX_EMI_AMOUNT','MIN_EMI_AMOUNT', 'NET_LTV',\
'NET_RECEIVABLE', 'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR','PRE_EMI_DUEAMT','PRE_EMI_OS_AMOUNT','PAID_AMOUNT']].corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
# There are still certain variables which we will have to drop, as multicollinearity is still present.
# PRE_EMI_OS_AMOUNT 6.928615e+14
# EMI_RECEIVED_AMT 2.144571e+14
df_2.shape
# VIF diagnostics require an explicit intercept column: add_constant prepends one.
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
df_2=add_constant(df_2)
df_2.head(5)
# Sanity check: receipts should not predate the authorization date.
df_2[df_2['LAST_RECEIPT_DATE'] < df_2['AUTHORIZATIONDATE']].head()[['AUTHORIZATIONDATE','INTEREST_START_DATE','LAST_RECEIPT_DATE']]
# As per business understanding , LAST_RECEIPT_DATE should be greater than AUTHORIZATIONDATE & INTEREST_START_DATE.
# and INTEREST_START_DATE should be greater than or equal to AUTHORIZATIONDATE.
#Whenever INTEREST_START_DATE is less than AUTHORIZATIONDATE we will going to replace INTEREST_START_DATE with
# AUTHORIZATIONDATE
#Whenever LAST_RECEIPT_DATE is less than INTEREST_START_DATE we will going to replace LAST_RECEIPT_DATE with
# INTEREST_START_DATE
df_2['LAST_RECEIPT_DATE'] = np.where(df_2['LAST_RECEIPT_DATE'] < df_2['INTEREST_START_DATE'],\
df_2['INTEREST_START_DATE'],df_2['LAST_RECEIPT_DATE'])
df_2['INTEREST_START_DATE'] = np.where(df_2['INTEREST_START_DATE'] < df_2['AUTHORIZATIONDATE'],\
df_2['AUTHORIZATIONDATE'],df_2['INTEREST_START_DATE'])
# Verify the repair: no receipt should now precede its interest-start date.
df_2[df_2['LAST_RECEIPT_DATE'] < df_2['INTEREST_START_DATE']].head()
df_2['DIFF_LAST_RECEIPT_INT_DATE'] = df_2['LAST_RECEIPT_DATE'] - df_2['INTEREST_START_DATE']
# Addition of new column DIFF_LAST_RECEIPT_INT_DATE which takes the difference between LAST_RECEIPT_DATE & INTEREST_START_DATE.
# Convert the timedelta to an integer day count for modelling.
df_2['DIFF_LAST_RECEIPT_INT_DATE']=df_2['DIFF_LAST_RECEIPT_INT_DATE'].dt.days
df_2.drop(['AUTHORIZATIONDATE','DIFF_AUTH_INT_DATE','CITY','CUSTOMERID','INTEREST_START_DATE',\
'LAST_RECEIPT_DATE','SCHEMEID','MOB'],axis=1,inplace=True)
# INTEREST_START_DATE and LAST_RECEIPT_DATE have been taken care of in the form of the DIFF_LAST_RECEIPT_INT_DATE column.
# AUTHORIZATIONDATE and INTEREST_START_DATE are somewhat the same as each other. They will not be used for model development,
# hence we are going to drop them.
# DIFF_AUTH_INT_DATE will not be useful for understanding, hence we drop it too.
# We also drop the CITY, SCHEMEID and MOB columns for model-development purposes as they do not seem
# important for model building.
df_2.drop(['EMI_AMOUNT','MAX_EMI_AMOUNT','MIN_EMI_AMOUNT'],axis=1,inplace=True)
#All these 3 columns have been combined to form the new column MEAN_EMI_AMOUNT:
#(df['MEAN_EMI_AMOUNT'] =(df['EMI_AMOUNT'] + df['MAX_EMI_AMOUNT'] + df['MIN_EMI_AMOUNT'])/3 )
#Hence we will be dropping these 3 columns.
df_2.info()
# The categorical band columns cannot go into a numeric VIF computation.
df_2.drop(['NET_LTV_RANGE','FOIR_Range'],axis=1,inplace=True)
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
# The VIF values are certain columns are very high . We will try to intelligenty feature engineer or drop the columns
# to pull back the VIF within limits
df_2.drop(['PRE_EMI_OS_AMOUNT'],axis=1,inplace=True)
# As VIF values is highest for this column hence we will going to drop it. and by using other columns HL,LAP,STLAP
# we can easily determine the status of STHL column. Hence we will be dropping STHL column
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
df_2.drop(['HL'],axis=1,inplace=True)
# PRE_EMI_OS_AMOUNT 6.928615e+14
# EMI_RECEIVED_AMT 2.144571e+14
# Looking at the heatmap , we can observe that EMI_RECEIVED_AMT is highly correlated with other variables . But this
# correlation is not seen for PRE_EMI_OS_AMOUNT . Hence we will be dropping EMI_RECEIVED_AMT rather than PRE_EMI_OS_AMOUNT
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
The following bookkeeping identities between columns will help us in performing
feature engineering (each derived column is redundant with the columns that
define it):

PRE_EMI_DUEAMT = PRE_EMI_RECEIVED_AMT + PRE_EMI_OS_AMOUNT
OUTSTANDING_PRINCIPAL + PAID_PRINCIPAL = LOAN_AMT
BALANCE_TENURE + COMPLETED_TENURE = CURRENT_TENOR
ORIGNAL_TENOR - CURRENT_TENOR = DIFF_ORIGINAL_CURRENT_TENOR
EMI_OS_AMOUNT - BALANCE_EXCESS = NET_RECEIVABLE
PRE_EMI_DUEAMT - PRE_EMI_OS_AMOUNT = PRE_EMI_RECEIVED_AMT
INTEREST_START_DATE - AUTHORIZATIONDATE = DIFF_AUTH_INT_DATE
CURRENT_INTEREST_RATE_MAX - CURRENT_INTEREST_RATE_MIN = DIFF_CURRENT_INTEREST_RATE_MAX_MIN
MAX_EMI_AMOUNT - MIN_EMI_AMOUNT = DIFF_EMI_AMOUNT_MAX_MIN
ORIGNAL_INTEREST_RATE - CURRENT_INTEREST_RATE = DIFF_ORIGINAL_CURRENT_INTEREST_RATE

Tenure columns linked by the identities above:
BALANCE_TENURE, COMPLETED_TENURE, CURRENT_TENOR, DIFF_ORIGINAL_CURRENT_TENOR, ORIGNAL_TENOR

Mean_Interest_Rate = (CURRENT_INTEREST_RATE_MAX + CURRENT_INTEREST_RATE + CURRENT_INTEREST_RATE_MIN) / 3,
combining 'CURRENT_INTEREST_RATE', 'CURRENT_INTEREST_RATE_MAX', 'CURRENT_INTEREST_RATE_MIN'.
# VIF check before the next round of drops.
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
# We drop EXCESS_AVAILABLE as it has high correlation with other variables
# and an infinite VIF value.
df_2.drop(['EXCESS_AVAILABLE'],axis=1,inplace=True)
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
# DIFF_ORIGINAL_CURRENT_TENOR is derivable from the tenor columns (see the
# identities noted above), hence redundant.
df_2.drop(['DIFF_ORIGINAL_CURRENT_TENOR'],axis=1,inplace=True)
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
# Drop BALANCE_EXCESS and re-check VIF.
df_2.drop(['BALANCE_EXCESS'],axis=1,inplace=True)
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
# Drop NET_DISBURSED_AMT and re-check VIF.
df_2.drop(['NET_DISBURSED_AMT'],axis=1,inplace=True)
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
# Drop the derived date-difference feature and run a final VIF pass.
df_2.drop(['DIFF_LAST_RECEIPT_INT_DATE'],axis=1,inplace=True)
pd.Series([variance_inflation_factor(df_2.values, i)
for i in range(df_2.shape[1])],
index=df_2.columns)
# Correlation heatmap of the surviving features.
plt.figure(figsize=(20,20))
sns.heatmap(df_2.corr(),annot=True,fmt='0.1g',linewidths=2, linecolor='black',cbar_kws= {'orientation': 'horizontal'})
df_2.head()
df4=df_2.copy()
from sklearn.model_selection import train_test_split
# Separate features from the FORECLOSURE target and hold out 30% for testing.
x=df4.drop(['FORECLOSURE'],axis=1)
y=df4.pop('FORECLOSURE')
x_new=x.copy()
y_new=y.copy()
x_train_new,x_test_new,y_train_new,y_test_new=train_test_split(x_new,y_new,test_size=0.3,random_state=0)
# Preserve the binary product flags before scaling; they are re-attached
# after standardisation so they stay 0/1.
x_new_copy=pd.DataFrame()
x_new_copy['LAP']=x_train_new['LAP']
x_new_copy['STHL']=x_train_new['STHL']
x_new_copy['STLAP']=x_train_new['STLAP']
x_new_copy.head(2)
# Reset to a positional index so the later concat aligns with the scaled frame.
x_new_copy.reset_index(drop=True,inplace=True)
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
# Drop the intercept column ('const' -- presumably added earlier via
# statsmodels add_constant for the VIF computation; TODO confirm upstream)
# and the binary flags from the frame that will be standardised.
x_train_new.drop(['const','LAP','STHL','STLAP'],axis=1,inplace=True)
x_test_new.drop(['const'],axis=1,inplace=True)
x_train_new.shape
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# Fit the scaler on the training split only; the test split is transformed later.
x_train_new=pd.DataFrame(ss.fit_transform(x_train_new),columns = x_train_new.columns)
x_train_new.head()
x_test_new.head()
# Distribution check of the scaled training features (before outlier capping),
# four 2x3 figures.
# BUG FIX: the original third figure plotted unscaled df4 columns while every
# other panel (and the identical post-capping pass below) plots x_train_new;
# all panels now consistently show the standardized training data.
precap_panels = [
    ['BALANCE_TENURE', 'DPD', 'COMPLETED_TENURE',
     'CURRENT_INTEREST_RATE_CHANGES', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN',
     'DIFF_ORIGINAL_CURRENT_INTEREST_RATE'],
    ['DUEDAY', 'EMI_DUEAMT', 'EMI_OS_AMOUNT',
     'EXCESS_ADJUSTED_AMT', 'FOIR', 'LAST_RECEIPT_AMOUNT'],
    ['LATEST_TRANSACTION_MONTH', 'NET_LTV', 'NET_RECEIVABLE',
     'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR'],
    ['OUTSTANDING_PRINCIPAL', 'PRE_EMI_DUEAMT', 'Percentage_Completion',
     'Mean_Interest_Rate', 'MEAN_EMI_AMOUNT', 'PAID_AMOUNT'],
]
for panel in precap_panels:
    f, ax = plt.subplots(2, 3, figsize=(18, 10), squeeze=True)
    for pos, feature in enumerate(panel):
        sns.distplot(x_train_new[feature], ax=ax[pos // 3, pos % 3])
# Report how many standardized values fall outside the +/-3 sigma band,
# winsorize them to [-3, 3], then re-report to confirm the capping worked.
def _report_extremes(frame):
    # One (>3, <-3, blank) triple of lines per feature.
    for feature in frame.columns:
        print("{} column has {} records having zscore greater than 3".format(feature, len(frame[frame[feature] > 3])))
        print("{} column has {} records having zscore lesser than -3".format(feature, len(frame[frame[feature] < -3])))
        print("")

_report_extremes(x_train_new)
for feature in x_train_new.columns:
    # clip() is equivalent to the original pair of np.where() assignments:
    # values >= 3 become 3, values <= -3 become -3.
    x_train_new[feature] = x_train_new[feature].clip(lower=-3, upper=3)
_report_extremes(x_train_new)
# Re-inspect the training-feature distributions after the +/-3 capping
# (same four 2x3 figures as the pre-capping pass).
capped_panels = [
    ['BALANCE_TENURE', 'DPD', 'COMPLETED_TENURE',
     'CURRENT_INTEREST_RATE_CHANGES', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN',
     'DIFF_ORIGINAL_CURRENT_INTEREST_RATE'],
    ['DUEDAY', 'EMI_DUEAMT', 'EMI_OS_AMOUNT',
     'EXCESS_ADJUSTED_AMT', 'FOIR', 'LAST_RECEIPT_AMOUNT'],
    ['LATEST_TRANSACTION_MONTH', 'NET_LTV', 'NET_RECEIVABLE',
     'NUM_EMI_CHANGES', 'NUM_LOW_FREQ_TRANSACTIONS', 'ORIGNAL_TENOR'],
    ['OUTSTANDING_PRINCIPAL', 'PRE_EMI_DUEAMT', 'Percentage_Completion',
     'Mean_Interest_Rate', 'MEAN_EMI_AMOUNT', 'PAID_AMOUNT'],
]
for panel in capped_panels:
    f, ax = plt.subplots(2, 3, figsize=(18, 10), squeeze=True)
    for pos, feature in enumerate(panel):
        sns.distplot(x_train_new[feature], ax=ax[pos // 3, pos % 3])
x_train_new.head()
# Apply the same standardisation to the test split, keeping the binary
# product flags (LAP/STHL/STLAP) unscaled.
df_test=x_test_new.copy()
df_test.head(2)
df_test1=pd.DataFrame()
df_test1['LAP']=x_test_new['LAP']
df_test1['STHL']=x_test_new['STHL']
df_test1['STLAP']=x_test_new['STLAP']
df_test1.head(2)
# Positional index so the concat below aligns with the re-indexed scaled frame.
df_test1.reset_index(drop=True,inplace=True)
x_test_new.drop(['LAP','STHL','STLAP'],axis=1,inplace=True)
# transform() only -- the scaler was fitted on the training split.
x_test_new=pd.DataFrame(ss.transform(x_test_new),columns = x_test_new.columns)
x_test_new.head(10)
# Re-attach the unscaled binary flags to both splits.
x_test_new=pd.concat([x_test_new,df_test1],axis=1)
x_test_new.head(2)
x_train_new= pd.concat([x_train_new,x_new_copy],axis=1)
x_test_new.head(2)
x_train_new.head(2)
from sklearn.cluster import KMeans
# Customer segmentation with K-Means on the engineered feature set.
df_clusters=df_2.copy()
df_clusters.head()
df_clusters.drop(['const'],axis=1,inplace=True)
nbfc_arr=np.array(df_clusters)
# Elbow method: within-cluster sum of squares (inertia) for k = 1..9.
wss=[]
for i in range(1,10):
    KM = KMeans(n_clusters=i)
    KM.fit(nbfc_arr)
    wss.append(KM.inertia_)
plt.plot(range(1,10), wss)
# Taking 3 clusters based on the elbow plot.
k_means=KMeans(n_clusters=3)
k_means.fit(df_clusters)
labels=k_means.labels_
from sklearn.metrics import silhouette_samples,silhouette_score
# Overall and per-row silhouette quality of the 3-cluster solution.
silhouette_score(df_clusters,labels)
df_clusters['silhouette_samples']=silhouette_samples(df_clusters,labels)
df_clusters['Kmeans_Cluster'] = labels
df_clusters.head()
df_clusters['Kmeans_Cluster'].value_counts()
# NOTE(review): df_1 is defined outside this excerpt -- presumably an earlier
# frame that still holds CUSTOMERID; confirm upstream.
df_clusters['Customer_ID'] = df_1['CUSTOMERID']
df_clusters[df_clusters['Kmeans_Cluster'] == 0].describe()
df_clusters[df_clusters['Kmeans_Cluster'] == 2].describe()
# Customers in cluster 2 have better financials: lower DPD, Due Day,
# EMI_DUEAMT, EMI_OS_AMOUNT, FOIR, NET_LTV, OUTSTANDING_PRINCIPAL -- they have
# a lower probability of foreclosure.
# Customers in cluster 0 have poorer financials on the same measures and a
# higher probability of foreclosure, so the NBFC should focus on them.
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
# Two logistic baselines on the scaled training split:
# one with built-in CV over the regularisation strength, and one plain.
log_modelCV = LogisticRegressionCV().fit(x_train_new, y_train_new)
# Bare accuracy expressions (notebook-style display).
log_modelCV.score(x_train_new, y_train_new)
log_modelCV.score(x_test_new, y_test_new)
log_model = LogisticRegression().fit(x_train_new, y_train_new)
log_model.score(x_train_new, y_train_new)
log_model.score(x_test_new, y_test_new)
from sklearn.metrics import classification_report,confusion_matrix
def confusionmatrix(y_actual, y_predict):
    """Render the confusion matrix of actual vs predicted labels as an annotated heatmap."""
    matrix = confusion_matrix(y_actual, y_predict)
    sns.heatmap(matrix, annot=True, fmt='g')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
def class_report(y_actual, y_predict):
    """Print the precision/recall/F1 summary for actual vs predicted labels."""
    report = classification_report(y_actual, y_predict)
    print(report)
# Train/test confusion matrices and classification reports for the plain and
# CV logistic models.
confusionmatrix(y_train_new,log_model.predict(x_train_new))
class_report(y_train_new,log_model.predict(x_train_new))
confusionmatrix(y_test_new,log_model.predict(x_test_new))
class_report(y_test_new,log_model.predict(x_test_new))
confusionmatrix(y_train_new,log_modelCV.predict(x_train_new))
class_report(y_train_new,log_modelCV.predict(x_train_new))
confusionmatrix(y_test_new,log_modelCV.predict(x_test_new))
class_report(y_test_new,log_modelCV.predict(x_test_new))
# Class balance before resampling.
y_train_new.value_counts()
import imblearn
from imblearn.over_sampling import SMOTE
# Oversample the minority class on the TRAINING split only.
sm = SMOTE(random_state=0)
x_train_smote, y_train_smote = sm.fit_resample(x_train_new, y_train_new)
# BUG FIX: the original called x_train.info() on an undefined name;
# inspect the resampled frame instead.
x_train_smote.info()
y_train_smote.value_counts()
# CV logistic regression on the balanced data, evaluated on the original
# (unbalanced) train and test splits.
log_modelCV_smote = LogisticRegressionCV()
log_modelCV_smote = log_modelCV_smote.fit(x_train_smote, y_train_smote)
print(log_modelCV_smote.score(x_train_smote, y_train_smote))
print(log_modelCV_smote.score(x_test_new, y_test_new))
class_report(y_train_new, log_modelCV_smote.predict(x_train_new))
class_report(y_test_new, log_modelCV_smote.predict(x_test_new))
confusionmatrix(y_train_new, log_modelCV_smote.predict(x_train_new))
confusionmatrix(y_test_new, log_modelCV_smote.predict(x_test_new))
from sklearn.metrics import roc_auc_score,roc_curve
def roc_plot(model_name, x_data, y_lables):
    """Print the ROC-AUC score of *model_name* on (x_data, y_lables) and draw its ROC curve."""
    positive_probs = model_name.predict_proba(x_data)[:, 1]
    print('roc_auc_score -->', roc_auc_score(y_lables, positive_probs))
    fpr, tpr, thresholds = roc_curve(y_lables, positive_probs)
    # Dashed diagonal = a random classifier's performance.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot(fpr, tpr, marker='.')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title('ROC')
    plt.show()
#model_name is the name of the model, x_data is the data that needs to be passed and y_labels is the class values
#Graphical representation of the contrast between true positive rates and false-positive rates at various thresholds.
roc_plot(log_modelCV_smote,x_train_new,y_train_new)
roc_plot(log_modelCV_smote,x_test_new,y_test_new)
# Signed logistic coefficients show each feature's direction of influence.
log_modelCV_smote.coef_[0]
plt.figure(figsize=(10,8))
sns.barplot(x=log_modelCV_smote.coef_[0] ,y=x_test_new.columns)
roc_plot(log_modelCV,x_train_new,y_train_new)
roc_plot(log_modelCV,x_test_new,y_test_new)
# Support Vector Classifier on the SMOTE-balanced data
# (probability=True enables predict_proba, required by roc_plot).
from sklearn.svm import SVC
svc_smote=SVC(probability=True)
svc_smote=svc_smote.fit(x_train_smote,y_train_smote)
class_report(y_train_new,svc_smote.predict(x_train_new))
class_report(y_test_new,svc_smote.predict(x_test_new))
confusionmatrix(y_train_new,svc_smote.predict(x_train_new))
confusionmatrix(y_test_new,svc_smote.predict(x_test_new))
roc_plot(svc_smote,x_train_new,y_train_new)
roc_plot(svc_smote,x_test_new,y_test_new)
# Non-linear SVC: cubic polynomial kernel, also on the SMOTE data.
svc_smote_nl=SVC(kernel='poly',degree=3,probability=True)
svc_smote_nl=svc_smote_nl.fit(x_train_smote,y_train_smote)
class_report(y_train_new,svc_smote_nl.predict(x_train_new))
class_report(y_test_new,svc_smote_nl.predict(x_test_new))
confusionmatrix(y_train_new,svc_smote_nl.predict(x_train_new))
confusionmatrix(y_test_new,svc_smote_nl.predict(x_test_new))
roc_plot(svc_smote_nl,x_train_new,y_train_new)
roc_plot(svc_smote_nl,x_test_new,y_test_new)
#WITHOUT USING SMOTE -- same model on the unbalanced training split, for comparison.
svc_normal=SVC(probability=True)
svc_normal=svc_normal.fit(x_train_new,y_train_new)
class_report(y_test_new,svc_normal.predict(x_test_new))
confusionmatrix(y_test_new,svc_normal.predict(x_test_new))
class_report(y_train_new,svc_normal.predict(x_train_new))
confusionmatrix(y_train_new,svc_normal.predict(x_train_new))
roc_plot(svc_normal,x_test_new,y_test_new)
roc_plot(svc_normal,x_train_new,y_train_new)
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes on the SMOTE-balanced data, evaluated on the original splits.
gnb=GaussianNB()
gnb=gnb.fit(x_train_smote,y_train_smote)
class_report(y_train_new,gnb.predict(x_train_new))
confusionmatrix(y_train_new,gnb.predict(x_train_new))
class_report(y_test_new,gnb.predict(x_test_new))
confusionmatrix(y_test_new,gnb.predict(x_test_new))
roc_plot(gnb,x_train_new,y_train_new)
roc_plot(gnb,x_test_new,y_test_new)
# Same model WITHOUT SMOTE, for comparison.
gnb_normal=GaussianNB()
gnb_normal=gnb_normal.fit(x_train_new,y_train_new)
class_report(y_test_new,gnb_normal.predict(x_test_new))
confusionmatrix(y_test_new,gnb_normal.predict(x_test_new))
class_report(y_train_new,gnb_normal.predict(x_train_new))
confusionmatrix(y_train_new,gnb_normal.predict(x_train_new))
roc_plot(gnb_normal,x_test_new,y_test_new)
roc_plot(gnb_normal,x_train_new,y_train_new)
from sklearn.ensemble import RandomForestClassifier
# Initial random forest on the SMOTE-balanced training data.
# FIX: two arguments of the original call no longer exist in modern
# scikit-learn and make this cell raise a TypeError/ValueError:
#   - `min_impurity_split` was removed in scikit-learn 1.0;
#   - `max_features='auto'` was removed in 1.3 -- for classifiers it meant
#     'sqrt', which is used explicitly here (same behavior).
rfc = RandomForestClassifier(n_estimators=1000,
                             criterion='gini',
                             max_depth=None,
                             min_samples_split=100,
                             min_samples_leaf=50,
                             min_weight_fraction_leaf=0.0,
                             max_features='sqrt',
                             max_leaf_nodes=None,
                             min_impurity_decrease=0.0,
                             bootstrap=True,
                             oob_score=False,
                             n_jobs=None,
                             random_state=0,
                             verbose=0,
                             warm_start=False,
                             class_weight=None,
                             ccp_alpha=0.0,
                             max_samples=None)
rfc = rfc.fit(x_train_smote, y_train_smote)
# Evaluate on the un-resampled train and test splits.
class_report(y_train_new, rfc.predict(x_train_new))
class_report(y_test_new, rfc.predict(x_test_new))
from sklearn.model_selection import GridSearchCV
# Hyper-parameter tuning of the random forest (10-fold CV on the SMOTE data).
param_grid = {
    'min_samples_split' : [50,70,100],
    'min_samples_leaf':[40,50,60],
    'max_depth':[10,15,20],
    'random_state' : [0]
}
RF_model=RandomForestClassifier()
grid_search=GridSearchCV(estimator=RF_model,param_grid=param_grid,cv=10)
grid_search.fit(x_train_smote,y_train_smote)
grid_search.best_estimator_
# Refit with the best parameters reported above.
RF_model=RandomForestClassifier(max_depth=15, min_samples_leaf=40, min_samples_split=50,
random_state=0)
RF_model.fit(x_train_smote, y_train_smote)
## Performance Matrix on train data set
y_train_predict = RF_model.predict(x_train_new)
model_score =RF_model.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
print('Scores and Curve for Training data is ')
roc_plot(RF_model,x_train_new,y_train_new)
## Performance Matrix on test data set
y_test_predict = RF_model.predict(x_test_new)
model_score = RF_model.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test_new, y_test_predict)
class_report(y_test_new, y_test_predict)
print('Scores and Curve for Testing data is ')
roc_plot(RF_model,x_test_new,y_test_new)
# Feature importances (as percentages) of the tuned forest.
pd.DataFrame(100*(RF_model.feature_importances_),x_train_new.columns)
plt.figure(figsize=(10,8))
sns.barplot(y=x_train_new.columns,x=100*(RF_model.feature_importances_))
# Same hyper-parameters trained WITHOUT SMOTE, for comparison.
RF_model_normal=RandomForestClassifier(max_depth=15, min_samples_leaf=40, min_samples_split=50,
random_state=0)
RF_model_normal=RF_model_normal.fit(x_train_new,y_train_new)
class_report(y_train_new,RF_model_normal.predict(x_train_new))
confusionmatrix(y_train_new,RF_model_normal.predict(x_train_new))
class_report(y_test_new,RF_model_normal.predict(x_test_new))
confusionmatrix(y_test_new,RF_model_normal.predict(x_test_new))
roc_plot(RF_model_normal,x_test_new,y_test_new)
roc_plot(RF_model_normal,x_train_new,y_train_new)
from sklearn.ensemble import AdaBoostClassifier
# Tune AdaBoost on the SMOTE-balanced training data.
param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.1, 0.01, 0.001],
    'algorithm': ['SAMME', 'SAMME.R']
}
ADB_model = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=ADB_model, param_grid=param_grid)
grid_search.fit(x_train_smote, y_train_smote)
grid_search.best_estimator_
# Refit with the best hyper-parameters found above.
ADB_model = AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
ADB_model
# BUG FIX: the original fitted this SMOTE-tuned model on the unbalanced
# x_train_new even though a separate no-SMOTE ADB_model_normal is trained
# below; fit it on the SMOTE data as the tuning intended.
ADB_model.fit(x_train_smote, y_train_smote)
## Performance Matrix on train data set
y_train_predict = ADB_model.predict(x_train_new)
model_score = ADB_model.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
## Performance Matrix on test data set
y_test_predict = ADB_model.predict(x_test_new)
model_score = ADB_model.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test_new, y_test_predict)
class_report(y_test_new, y_test_predict)
roc_plot(ADB_model, x_train_new, y_train_new)
roc_plot(ADB_model, x_test_new, y_test_new)
# Feature importances (as percentages).
pd.DataFrame(100 * (ADB_model.feature_importances_), x_train_new.columns)
# AdaBoost WITHOUT SMOTE, for comparison.
ADB_model_normal = AdaBoostClassifier(learning_rate=0.1, n_estimators=1000)
ADB_model_normal = ADB_model_normal.fit(x_train_new, y_train_new)
# BUG FIX: the original evaluated RF_model_normal here (copy-paste error);
# this section reports on ADB_model_normal.
class_report(y_train_new, ADB_model_normal.predict(x_train_new))
confusionmatrix(y_train_new, ADB_model_normal.predict(x_train_new))
class_report(y_test_new, ADB_model_normal.predict(x_test_new))
confusionmatrix(y_test_new, ADB_model_normal.predict(x_test_new))
roc_plot(ADB_model_normal, x_train_new, y_train_new)
roc_plot(ADB_model_normal, x_test_new, y_test_new)
# Class balance of the (un-resampled) training labels.
y_train_new.value_counts(normalize=True)
from sklearn.neighbors import KNeighborsClassifier
# KNN trained on the SMOTE-balanced data.
KNN = KNeighborsClassifier()
KNN = KNN.fit(x_train_smote, y_train_smote)
## Performance Matrix on train data set
y_train_predict = KNN.predict(x_train_new)
model_score = KNN.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
## Performance Matrix on test data set
y_test_predict = KNN.predict(x_test_new)
model_score = KNN.score(x_test_new, y_test_new)
print(model_score)
confusionmatrix(y_test_new, y_test_predict)
class_report(y_test_new, y_test_predict)
roc_plot(KNN, x_train_new, y_train_new)
roc_plot(KNN, x_test_new, y_test_new)
# KNN WITHOUT SMOTE, for comparison.
KNN_normal = KNeighborsClassifier()
KNN_normal = KNN_normal.fit(x_train_new, y_train_new)
## Performance Matrix on train data set
y_train_predict = KNN_normal.predict(x_train_new)
model_score = KNN_normal.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
## Performance Matrix on test data set
y_test_predict = KNN_normal.predict(x_test_new)
model_score = KNN_normal.score(x_test_new, y_test_new)
print(model_score)
# BUG FIX: the original passed the undefined name `y_test` to the two calls
# below (NameError); the held-out labels are y_test_new.
confusionmatrix(y_test_new, y_test_predict)
class_report(y_test_new, y_test_predict)
roc_plot(KNN_normal, x_train_new, y_train_new)
roc_plot(KNN_normal, x_test_new, y_test_new)
import xgboost as xgb
# Stage-wise XGBoost tuning on the SMOTE data: tree depth / min_child_weight
# first, then gamma.
# FIX: the `iid` argument was removed from GridSearchCV in scikit-learn 0.24
# (its False behavior became the default); it is dropped from every search.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}
gsearch1 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=4, cv=5)
gsearch1.fit(x_train_smote, y_train_smote)
gsearch1.best_params_, gsearch1.best_score_
# Quick check of the refined search range.
for i in range(8, 12, 1):
    print(i)
# Stage 2: refine depth / child-weight around the stage-1 optimum.
param_test2 = {
    'max_depth': range(8, 12, 1),
    'min_child_weight': range(1, 4, 1)
}
gsearch2 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=9,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test2, scoring='roc_auc', n_jobs=4, cv=5)
gsearch2.fit(x_train_smote, y_train_smote)
gsearch2.best_params_, gsearch2.best_score_
# Stage 3: tune gamma with the chosen depth.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch3 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=11,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test3, scoring='roc_auc', n_jobs=4, cv=5)
gsearch3.fit(x_train_smote, y_train_smote)
gsearch3.best_params_, gsearch3.best_score_
# Interim model with the stage-1..3 choices.
xgb2 = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=11,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
xgb2.fit(x_train_smote, y_train_smote)
class_report(y_train_new, xgb2.predict(x_train_new))
class_report(y_test_new, xgb2.predict(x_test_new))
# Stage 4: coarse search over subsample / colsample_bytree.
# FIX (as above): `iid=False` removed -- the argument no longer exists in
# modern scikit-learn GridSearchCV.
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
gsearch4 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test4, scoring='roc_auc', n_jobs=4, cv=5)
gsearch4.fit(x_train_smote, y_train_smote)
gsearch4.best_params_, gsearch4.best_score_
# Stage 5: finer sampling-ratio search.
param_test5 = {
    'subsample': [i / 100 for i in range(75, 90, 5)],
    'colsample_bytree': [i / 10.0 for i in range(4, 7)]
}
gsearch5 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test5, scoring='roc_auc', n_jobs=4, cv=5)
gsearch5.fit(x_train_smote, y_train_smote)
gsearch5.best_params_, gsearch5.best_score_
# Stage 6: finest sampling-ratio search.
param_test6 = {
    'subsample': [i / 100 for i in range(80, 90, 2)],
    'colsample_bytree': [i / 100 for i in range(58, 65, 2)]
}
gsearch6 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test6, scoring='roc_auc', n_jobs=4, cv=5)
gsearch6.fit(x_train_smote, y_train_smote)
gsearch6.best_params_, gsearch6.best_score_
# Stage 7: coarse L1 regularisation (reg_alpha) search.
param_test7 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch7 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test7, scoring='roc_auc', n_jobs=4, cv=5)
gsearch7.fit(x_train_smote, y_train_smote)
gsearch7.best_params_, gsearch7.best_score_
# Stage 8: finer reg_alpha search.
param_test8 = {
    'reg_alpha': [0.001, 0.002, 0.003, 0.004]
}
gsearch8 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11, reg_alpha=1e-05,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test8, scoring='roc_auc', n_jobs=4, cv=5)
gsearch8.fit(x_train_smote, y_train_smote)
gsearch8.best_params_, gsearch8.best_score_
# Model with the tuned regularisation.
xgb3 = xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11, reg_alpha=0.001,
                         min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
                         objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
xgb3 = xgb3.fit(x_train_smote, y_train_smote)
class_report(y_test_new, xgb3.predict(x_test_new))
class_report(y_train_new, xgb3.predict(x_train_new))
# Stage 9: learning-rate search.
param_test9 = {
    'learning_rate': [0.01, 0.02, 0.1]
}
gsearch9 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11, reg_alpha=1e-05,
                                                    min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.6,
                                                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test9, scoring='roc_auc', n_jobs=4, cv=5)
gsearch9.fit(x_train_smote, y_train_smote)
gsearch9.best_params_, gsearch9.best_score_
# Final tuned XGBoost model (gbtree booster) on the SMOTE data.
xgb4=xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=11,reg_alpha=1e-05,booster='gbtree',
min_child_weight=44, gamma=0, subsample=0.8, colsample_bytree=0.6,
objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
xgb4=xgb4.fit(x_train_smote, y_train_smote)
class_report(y_train_new,xgb4.predict(x_train_new))
class_report(y_test_new,xgb4.predict(x_test_new))
confusionmatrix(y_train_new,xgb4.predict(x_train_new))
confusionmatrix(y_test_new,xgb4.predict(x_test_new))
print('Scores and Curve for Testing data is ')
roc_plot(xgb4,x_test_new,y_test_new)
print('Scores and Curve for Training data is ')
roc_plot(xgb4,x_train_new,y_train_new)
# Gain-based feature importance from the underlying booster.
feature_importance=xgb4.get_booster().get_score(importance_type="gain")
feature_col = list(feature_importance.keys())
feature_value =list(feature_importance.values())
[feature_value]
plt.figure(figsize=(10,5))
sns.barplot(x=100*(xgb4.feature_importances_),y=x_train_smote.columns)
# Split-count ("weight") importance, for comparison with gain.
xgb4.get_booster().get_score(importance_type='weight')
# Tuned XGBoost trained WITHOUT SMOTE, for comparison.
xgb4_normal = xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11, reg_alpha=1e-05, booster='gbtree',
                                min_child_weight=44, gamma=0, subsample=0.8, colsample_bytree=0.6,
                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
xgb4_normal = xgb4_normal.fit(x_train_new, y_train_new)
## Performance Matrix on train data set
y_train_predict = xgb4_normal.predict(x_train_new)
model_score = xgb4_normal.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
## Performance Matrix on test data set
y_test_predict = xgb4_normal.predict(x_test_new)
model_score = xgb4_normal.score(x_test_new, y_test_new)
print(model_score)
# BUG FIX: the original passed the undefined name `y_test` to the two calls
# below (NameError); the held-out labels are y_test_new.
confusionmatrix(y_test_new, y_test_predict)
class_report(y_test_new, y_test_predict)
roc_plot(xgb4_normal, x_train_new, y_train_new)
roc_plot(xgb4_normal, x_test_new, y_test_new)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# LDA on the SMOTE-balanced data, evaluated on the original splits.
lda = LinearDiscriminantAnalysis()
lda = lda.fit(x_train_smote, y_train_smote)
## Performance Matrix on train data set
y_train_predict = lda.predict(x_train_new)
model_score = lda.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
## Performance Matrix on test data set
y_test_predict = lda.predict(x_test_new)
model_score = lda.score(x_test_new, y_test_new)
print(model_score)
# BUG FIX: the original passed the undefined name `y_test` here (NameError);
# the held-out labels are y_test_new.
confusionmatrix(y_test_new, y_test_predict)
class_report(y_test_new, y_test_predict)
roc_plot(lda, x_test_new, y_test_new)
roc_plot(lda, x_train_new, y_train_new)
# LDA WITHOUT SMOTE, for comparison.
lda_normal = LinearDiscriminantAnalysis()
lda_normal = lda_normal.fit(x_train_new, y_train_new)
## Performance Matrix on train data set
y_train_predict = lda_normal.predict(x_train_new)
model_score = lda_normal.score(x_train_new, y_train_new)
print(model_score)
confusionmatrix(y_train_new, y_train_predict)
class_report(y_train_new, y_train_predict)
## Performance Matrix on test data set
y_test_predict = lda_normal.predict(x_test_new)
model_score = lda_normal.score(x_test_new, y_test_new)
print(model_score)
# BUG FIX: same undefined `y_test` as above.
confusionmatrix(y_test_new, y_test_predict)
class_report(y_test_new, y_test_predict)
roc_plot(lda_normal, x_train_new, y_train_new)
roc_plot(lda_normal, x_test_new, y_test_new)
df_2.head()
# Rebuild a fresh train/test split from df_2 for the reduced-feature model.
df4 = df_2.copy()
from sklearn.model_selection import train_test_split
x = df4.drop(['FORECLOSURE'], axis=1)
y = df4.pop('FORECLOSURE')
x_new = x.copy()
y_new = y.copy()
x_train_new, x_test_new, y_train_new, y_test_new = train_test_split(x_new, y_new, test_size=0.3, random_state=0)
### Z score Transformation
from sklearn.preprocessing import StandardScaler
# Drop the intercept column from both splits.
x_train_new.drop(['const'], axis=1, inplace=True)
x_test_new.drop(['const'], axis=1, inplace=True)
x_train_new.shape
# Features selected earlier from the random-forest importances.
list_col = ['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN',
            'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT',
            'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP',
            'STLAP']
# BUG FIX: the original dropped columns inside `for col in x_train_new.columns`,
# mutating the very index being iterated, which can silently skip columns.
# Build the drop list first, then drop once.
unselected = [col for col in x_train_new.columns if col not in list_col]
x_train_new.drop(unselected, axis=1, inplace=True)
x_test_new.drop(unselected, axis=1, inplace=True)
print(x_train_new.shape)
print(x_test_new.shape)
x_train_new.head(2)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# Re-fit the scaler on the reduced training frame.
x_train_new = pd.DataFrame(ss.fit_transform(x_train_new), columns=x_train_new.columns)
x_train_new.head()
x_test_new.head()
x_train_smote.head()
RF_model.feature_importances_.T
# One-row frame mapping each SMOTE-training feature to its RF importance (%).
feature_important = pd.DataFrame(columns=x_train_smote.columns)
for i, col in enumerate(feature_important.columns):
    feature_important.loc[0, col] = 100 * RF_model.feature_importances_[i]
for col in feature_important.columns:
    print(col)
#feature_important.loc[0,'BALANCE_TENURE']=RF_model.feature_importances_[0]
feature_important.columns
feature_important.shape
# Keep only features contributing at least 3% importance.
# BUG FIX: the original dropped columns inside a side-effect comprehension
# iterating the same columns index being mutated; build the list first,
# then drop once.
weak_cols = [col for col in feature_important.columns if feature_important.loc[0, col] < 3]
feature_important.drop(weak_cols, axis=1, inplace=True)
# Reduce train/test/SMOTE frames to the selected high-importance features.
x_train_new_copy = x_train_new.copy()
x_test_new_copy = x_test_new.copy()
x_train_smote_copy = x_train_smote.copy()
list_col = ['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN',
            'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT',
            'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP',
            'STLAP']
# BUG FIX: the original dropped columns inside `for col in
# x_train_new_copy.columns`, mutating the index being iterated (can skip
# columns). Build the drop list first, then drop once from each frame.
unwanted = [col for col in x_train_new_copy.columns if col not in list_col]
x_train_new_copy.drop(unwanted, axis=1, inplace=True)
x_test_new_copy.drop(unwanted, axis=1, inplace=True)
x_train_smote_copy.drop(unwanted, axis=1, inplace=True)
# Preview of the deployment feature set plus the target.
df_2[['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN','DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT', \
'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP','STLAP','FORECLOSURE']].head()
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the deployment random forest.
param_grid = {
    'min_samples_split': [40, 50, 55],
    'min_samples_leaf': [30, 40, 45],
    'max_depth': [10, 15, 20],
    'random_state': [0],
}
# 10-fold cross-validated exhaustive search on the SMOTE-balanced training set.
rf_deploy = RandomForestClassifier()
grid_search_deploy = GridSearchCV(estimator=rf_deploy, param_grid=param_grid, cv=10)
grid_search_deploy.fit(x_train_smote_copy, y_train_smote)
# Inspect the winning configuration (used to hard-code the final model below).
grid_search_deploy.best_estimator_
# NOTE(review): the original cell contained a bare `ss.inverse_transform`
# expression here — accessing the bound method without calling it is a no-op
# statement, so it has been removed.
# Refit the random forest with the best hyper-parameters reported by the grid
# search above, then evaluate on the (unbalanced) train and test splits.
rf_deploy = RandomForestClassifier(max_depth=15, min_samples_leaf=40,
                                   min_samples_split=50, random_state=0)
rf_deploy = rf_deploy.fit(x_train_smote_copy, y_train_smote)
class_report(y_train_new, rf_deploy.predict(x_train_new_copy))
class_report(y_test_new, rf_deploy.predict(x_test_new_copy))
confusionmatrix(y_train_new, rf_deploy.predict(x_train_new_copy))
confusionmatrix(y_test_new, rf_deploy.predict(x_test_new_copy))
x_test_new_copy.head(1)
df_2.head(2)
x_train_new_copy.head(3)
ss.mean_.shape
# Build a reference table of the scaler's fitted mean (row 0) and standard
# deviation (row 1) per feature, for re-scaling inputs at deployment time.
a = x_train_new.copy()
# The dummy columns were not part of the scaled numeric features.
a.drop(['LAP', 'STHL', 'STLAP'], axis=1, inplace=True)
# reshape(1, -1) instead of the hard-coded (1, 24): works for any number of
# scaled features instead of breaking if the feature count changes.
scaling_features = pd.DataFrame(data=ss.mean_.reshape(1, -1), columns=a.columns)
scaling_features.loc[1] = ss.var_
scaling_features
# Convert the variance row in place to standard deviation.
scaling_features.iloc[1] = np.sqrt(scaling_features.iloc[1])
scaling_features
list_col = ['COMPLETED_TENURE', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN',
            'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'EMI_DUEAMT',
            'LATEST_TRANSACTION_MONTH', 'Mean_Interest_Rate', 'PAID_AMOUNT', 'LAP',
            'STLAP']
# Keep only the deployed features; one drop call instead of the original
# side-effect-only list comprehension.
scaling_features.drop([col for col in scaling_features.columns if col not in list_col],
                      axis=1, inplace=True)
scaling_features.head()
import pickle

ss.mean_
# Persist the deployment model and the fitted scaler. Context managers close
# every handle — the original left three files open (and an unclosed write
# handle is not guaranteed to be flushed to disk).
# NOTE(review): pickle is fine for our own artifacts, but never unpickle data
# from an untrusted source — pickle.load can execute arbitrary code.
with open('rf.pkl', 'wb') as rf_file:
    pickle.dump(rf_deploy, rf_file)
with open('rf.pkl', 'rb') as rf_file:
    loaded_model = pickle.load(rf_file)
print(loaded_model.score(x_train_new_copy, y_train_new))
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(ss, scaler_file)
rf_deploy.feature_importances_.reshape(9, 1)
rf_deploy.feature_importances_
type([[0.626971, 0.622898, 0.6142, 0.002305, 2.02789, 0.869787, 0.03947, 1, 0]])
x_train_new_copy.shape
x_train_smote_copy.head()
RF_model.feature_importances_[0]
pd.DataFrame(100 * (RF_model.feature_importances_.T))  # .T,columns=x_train_smote.columns
x_train_smote.columns
# Rebuild the modelling frame with the loan-type dummy columns added back.
x_new = x.copy()
y_new = y.copy()
for dummy in ('LAP', 'STHL', 'STLAP'):
    x_new[dummy] = df_2[dummy]
#x_new.drop(['LATEST_TRANSACTION_MONTH'],axis=1,inplace=True)
# 70/30 split with a fixed seed, then SMOTE to balance the training classes
# only (the test split keeps its natural class distribution).
x_train_new, x_test_new, y_train_new, y_test_new = train_test_split(
    x_new, y_new, test_size=0.3, random_state=0)
#Validation Set
#x_test_1,x_test_final,y_test_1,y_test_final=train_test_split(x_test_new,y_test_new,test_size=0.3,random_state=0)
sm_new = SMOTE(random_state=0)
x_train_new, y_train_new = sm_new.fit_resample(x_train_new, y_train_new)
x_train_new.head()
y_train_new.head()
# XGBoost on the rebuilt (unscaled, SMOTE-balanced) training data.
xgb4_new = xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11, reg_alpha=1e-05, booster='gbtree',
                             min_child_weight=44, gamma=0, subsample=0.8, colsample_bytree=0.6,
                             objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
xgb4_new = xgb4_new.fit(x_train_new, y_train_new)  # 44
x_train_new.head()
class_report(y_train_new, xgb4_new.predict(x_train_new))
x_train_new.shape
x_test_new.shape
# NOTE(review): the original applied `x_test_new.apply(zscore)` here, z-scoring
# the TEST set with its own statistics while the model was trained on raw,
# unscaled data — inconsistent preprocessing plus test-only statistics.
# Removed so train and test share the same (raw) scale; tree-based models are
# insensitive to feature scaling anyway.
x_test_new.head()
class_report(y_test_new, xgb4_new.predict(x_test_new))  # 88 94 91 98 without latest_Transaction
plt.figure(figsize=(10, 5))
sns.barplot(x=100 * (xgb4_new.feature_importances_), y=x_test_new.columns)
# Column subset for the reduced-feature XGBoost model; one shared list instead
# of three hand-copied column lists that could silently drift apart.
sel_cols = ['LATEST_TRANSACTION_MONTH', 'LAP', 'STHL', 'STLAP', 'EMI_DUEAMT',
            'CURRENT_INTEREST_RATE_CHANGES', 'DIFF_CURRENT_INTEREST_RATE_MAX_MIN',
            'DIFF_ORIGINAL_CURRENT_INTEREST_RATE', 'Mean_Interest_Rate', 'DPD',
            'DUEDAY', 'ORIGNAL_TENOR', 'PAID_AMOUNT']
x_new_smote = x_train_smote[sel_cols]
x_train_new_1 = x_train_new[sel_cols]
x_test_new_1 = x_test_new[sel_cols]
x_new_smote.head()
xgb4_new = xgb.XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=11, reg_alpha=1e-05, booster='gbtree',
                             min_child_weight=44, gamma=0, subsample=0.8, colsample_bytree=0.6,
                             objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
# FIX: train and evaluate on the matched frames from the NEW split. The
# original fit `x_new_smote` (rows from the OLD SMOTE set) against
# `y_train_new` (labels from the new split) — a row/label mismatch — and then
# predicted on `x_test_new_smote`, a name that is never defined (NameError);
# the frame actually created above is `x_test_new_1`.
xgb4_new = xgb4_new.fit(x_train_new_1, y_train_new)
class_report(y_train_new, xgb4_new.predict(x_train_new_1))
class_report(y_test_new, xgb4_new.predict(x_test_new_1))  # 83 95 89
confusionmatrix(y_train_new, xgb4_new.predict(x_train_new_1))
confusionmatrix(y_test_new, xgb4_new.predict(x_test_new_1))
plt.figure(figsize=(10, 5))
sns.barplot(x=100 * (xgb4_new.feature_importances_), y=x_test_new_1.columns)
print('Scores and Curve for Training data is ')
roc_plot(xgb4_new, x_train_new_1, y_train_new)
# FIX: the original label said "Training data" for the test curve too.
print('Scores and Curve for Testing data is ')
roc_plot(xgb4_new, x_test_new_1, y_test_new)
x_test.head(2)